{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "protective-stroke",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "from asr_deepspeech.etl import JSUTDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "forbidden-meeting",
   "metadata": {},
   "outputs": [],
   "source": [
    "landing = \"/FileStore/ASR/landing/jsut_ver1.1\"\n",
    "bronze = \"/FileStore/ASR/bronze/jsut_ver1.1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "elegant-mississippi",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter on the duration\n",
    "m, M = 6, 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "50d9439b-aedb-4385-af49-848655c90f86",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "aa667920ed4540ea93edbdcc046ae4d8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Convert JSUT\n",
    "dataset = JSUTDataset(16000)\n",
    "dataset = dataset.run(landing, bronze)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "753937f3-cad6-4dc1-866d-f68552d4a64f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                       audio_filepath  duration     fq  \\\n",
      "10  /srv/lake/bronze/CADIC/cadic-asr-deepspeech/js...      6.07  16000   \n",
      "30  /srv/lake/bronze/CADIC/cadic-asr-deepspeech/js...      7.71  16000   \n",
      "40  /srv/lake/bronze/CADIC/cadic-asr-deepspeech/js...      6.21  16000   \n",
      "49  /srv/lake/bronze/CADIC/cadic-asr-deepspeech/js...      6.98  16000   \n",
      "61  /srv/lake/bronze/CADIC/cadic-asr-deepspeech/js...      7.27  16000   \n",
      "\n",
      "                                          text  text_size  \n",
      "10        ミツバチはにおいのサンプルを巣に持ち帰ることによって食糧のありかを伝える         36  \n",
      "30  分析においてはクルツによって最初に分析された権威的行動の類型に負っている部分が大きい         42  \n",
      "40       闇夜に鉄砲じゃあるまいしそんな場当たり的なやり方でうまくいくとは思えないよ         37  \n",
      "49       システィナ礼拝堂は１４７３年にバティカン宮殿内に建立された壮大な礼拝堂です         37  \n",
      "61   占星学は科学的な根拠もないのにとても人気があって占星学は信じている人が多いようです         41  \n"
     ]
    }
   ],
   "source": [
    "# Silver (cache)\n",
    "dataset_df = dataset.filter_duration(m, M)\n",
    "print(dataset_df.head())\n",
    "\n",
    "# Gold\n",
    "gold_root = \"/FileStore/ASR/gold\"\n",
    "gold_dir = f\"{gold_root}/jsut_{m}-{M}\"\n",
    "os.makedirs(gold_dir, exist_ok=True)\n",
    "dataset.export_labels(f\"{gold_dir}/labels.csv\")\n",
    "dtrain, dtest = train_test_split(dataset_df, test_size=0.1)\n",
    "dtrain.to_csv(f\"{gold_dir}/train.csv\", index=False)\n",
    "dtest.to_csv(f\"{gold_dir}/test.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be9c7f28-e0d4-45f7-9d74-cd9fdfa35d55",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "vscode": {
   "interpreter": {
    "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
